Data Science Case Study#

Let’s load the data, gain some insights, and generate ideas for analysis.

Hide code cell source
# Import packages (duplicate pandas import removed)
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import datetime as dt

# Load data
# NOTE(review): hard-coded absolute paths — consider making these configurable
transactions = pd.read_csv('/Users/maxandchang/Documents/jobs/Appostrophe/case-study-transactions.csv')
installs = pd.read_csv('/Users/maxandchang/Documents/jobs/Appostrophe/case-study-installs.csv')

# Parse the date columns so datetime arithmetic works later on
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])
installs['install_date'] = pd.to_datetime(installs['install_date'])

# Turn the numeric segment id into a readable label, e.g. 0 -> 'segment0'
installs['segment'] = 'segment' + installs['segment'].astype(str)

# Unique user_ids that appear in the transactions table
trans_id = transactions['user_id'].unique()

# Flag installed users that have at least one transaction row
# (free trials included, since trial starts are transaction rows too)
installs['purchase_user'] = installs['user_id'].isin(trans_id).map({True: 'Yes', False: 'No'})

# Common figure width (px) shared by all plots below
figure_width = 770

Install data review#

Hide code cell source
# Share of 'Yes'/'No' purchase users within each segment, in percent
purchase_percentage = installs.groupby('segment')['purchase_user'].value_counts(normalize=True).mul(100)

# Define the color palette (reused by every plot below)
color_palette = ['rgb(31, 119, 180)', 'rgb(255, 127, 14)', 'rgb(44, 160, 44)', 'rgb(214, 39, 40)', 
                 'rgb(148, 103, 189)', 'rgb(140, 86, 75)', 'rgb(227, 119, 194)', 'rgb(127, 127, 127)', 
                 'rgb(188, 189, 34)']

# Daily install counts colored by segment; purchasers are hatched with 'x'
fig_installs = px.histogram(
    installs,
    x="install_date",
    color='segment',
    opacity=0.7,
    color_discrete_sequence=color_palette,
    pattern_shape="purchase_user",
    pattern_shape_sequence=["", "x"],
)

# Subtitle rendered as a paper-anchored annotation above the plot area
subtitle = dict(
    text='Users of purchase including free trials are marked by x',
    x=-0.01,
    y=1.06,
    xref='paper',
    yref='paper',
    showarrow=False,
    font=dict(size=16, color='grey'),
)
fig_installs.update_layout(
    bargap=0.1,
    title='Install distribution over time',
    width=figure_width,
    title_font=dict(size=18),
    yaxis_title='User Count',
    annotations=[subtitle],
)
fig_installs.update_traces(marker_pattern_size=5, selector=dict(type='histogram'))
fig_installs.show()
  • Each day, there are approximately 5000 installs from September 1-29, 2023.

  • Installs are divided as segment0, segment1, and segment2.

  • The conversion rates from installs to free-trials are respectively 10.120%, 9.997%, and 9.796% for segment0, segment1, and segment2.

Transaction data review#

Combine the installs and transactions, and remove the installed users who never tried the app.

Hide code cell source
## Fill in 0 when it is a free-trial (price_in_usd is NaN on free-trial rows)
transactions['price_in_usd'] = transactions['price_in_usd'].fillna(0)
# Inner-join installs with transactions on user_id — installed users with
# no transaction at all are dropped here
data_merge = pd.merge(installs, transactions, on='user_id', how='inner')

# Create a new column 'status' by combining the first letters from 'is_free_trial'
# and 'is_conversion_to_pay' plus the stringified price, e.g. 'TF0.0', 'FT2.99'
data_merge['status'] = data_merge['is_free_trial'].astype(str).str[0] + \
                        data_merge['is_conversion_to_pay'].astype(str).str[0] + \
                        data_merge['price_in_usd'].astype(str)

# Count the occurrences of each user_id (number of transaction rows per user)
user_id_counts = data_merge['user_id'].value_counts()

# More than one transaction row means the user paid at least once;
# a single row means the user only started a free trial
data_merge['payment_user'] = data_merge['user_id'].map(user_id_counts).apply(lambda x: 'No' if x == 1 else 'Yes')

# Add an assisting column as re_id 
# Remove duplicates based on 'user_id' column and keep the first appearance
data_merge_unique = data_merge.drop_duplicates(subset=['user_id'], keep='first')


# Sort the dataframe by 'segment' and 'transaction_date'
data_merge_unique_sorted = data_merge_unique.sort_values(by=['segment', 'transaction_date'])

## Re id: rank users within each segment by first transaction date,
## producing a compact per-segment user index used for plotting later
data_merge_unique_sorted['re_id'] = data_merge_unique_sorted.groupby('segment')['transaction_date'].rank(method='first')

# data_merge_unique_sorted['re_id'] = data_merge_unique_sorted['re_id'].astype(str)
data_merge_unique_sorted = data_merge_unique_sorted[['user_id', 're_id']]

# Attach the per-segment re_id back onto every transaction row
data_merge2 = pd.merge(data_merge, data_merge_unique_sorted, on='user_id', how='left')

data_merge2['re_id'] = data_merge2['re_id'].astype(int)

## Assign each transaction to a calendar week of the observation window
## (2023-09-01 = week 1)
start_date = dt.datetime(2023, 9, 1)
today_date = dt.datetime(2024, 3, 25)

# The week index is a pure row-wise computation, so no per-user groupby
# is needed (the original groupby('user_id').apply produced identical values)
data_merge2['week'] = (data_merge2['transaction_date'] - start_date).dt.days // 7 + 1

# Calculate the number of weekly transaction opportunities each user
# has had between installing and today_date
data_merge2['n_weeks'] = ((today_date - data_merge2['install_date']).dt.days // 7) + 1

## Keep only transactions within the first 30 observed weeks
# Can use 20 weeks as hyperparameter calibration firstly in the further analysis.
data_merge_payment = data_merge2[data_merge2['week'] < 31]

##
# Generate assisting rows so every user has one row per week 1-30,
# which shows the transaction behaviour of each user during weeks 1-30
# even for weeks with no activity
unique_users = data_merge_payment[['user_id', 're_id', 'segment']].drop_duplicates()
week_grid = pd.DataFrame({'week': range(1, 31)})
user_weeks = pd.merge(unique_users, week_grid, how="cross")
data_merge_payment_week = pd.merge(data_merge_payment, user_weeks, how='outer')
# Sort so each user's weekly history is contiguous
data_merge_payment_week = data_merge_payment_week.sort_values(by=['user_id', 'week'])

##
import numpy as np

# Encode a coarse per-row status code for the heatmaps below.
# statusH: 0/1/2 when the row belongs to a paying user in segment0/1/2,
#          3 when 'status' is NaN (an assisting row with no transaction),
#          NaN otherwise. np.select picks the first matching condition.
# Define conditions and values
conditions1 = [
    (data_merge_payment_week['segment'] == 'segment0') & (data_merge_payment_week['payment_user'] == 'Yes'), 
    (data_merge_payment_week['segment'] == 'segment1') & (data_merge_payment_week['payment_user'] == 'Yes'),
    (data_merge_payment_week['segment'] == 'segment2') & (data_merge_payment_week['payment_user'] == 'Yes'),
    data_merge_payment_week['status'].isna()
]
values1 = [0, 1, 2, 3]

# Apply conditions and assign values to the new column
data_merge_payment_week['statusH'] = np.select(conditions1, values1, default=np.nan)

# statusT: 0 = free-trial start (TF0.0), 1 = conversion to paid (FT2.99),
#          2 = recurring payment (FF2.99), 3 = no transaction that week
conditions2 = [
    data_merge_payment_week['status'] == 'TF0.0', 
    data_merge_payment_week['status'] == 'FT2.99',
    data_merge_payment_week['status'] == 'FF2.99',
    data_merge_payment_week['status'].isna()
]
values2 = [0, 1, 2, 3]

# Apply conditions and assign values to the new column
data_merge_payment_week['statusT'] = np.select(conditions2, values2, default=np.nan)

## Reshape data for visualisation: one wide matrix per segment,
## weeks as the index and users as the columns
dmpw = data_merge_payment_week[['statusT','week','user_id','segment']]
df_pay_free_0 = dmpw[dmpw['segment'] == 'segment0']
df_pay_free_1 = dmpw[dmpw['segment'] == 'segment1']
df_pay_free_2 = dmpw[dmpw['segment'] == 'segment2']
df_p_f_0_wide = df_pay_free_0.pivot(index="week", columns="user_id", values="statusT")
df_p_f_1_wide = df_pay_free_1.pivot(index="week", columns="user_id", values="statusT")
df_p_f_2_wide = df_pay_free_2.pivot(index="week", columns="user_id", values="statusT")

##
# One row per user, then count free-trial users per segment
data_merge_payment_unique = data_merge_payment.drop_duplicates(subset=['user_id'], keep='first')
user_number_segment = data_merge_payment_unique['segment'].value_counts()

# Number of transaction rows per user within each segment
segment_user_id_counts = data_merge_payment.groupby('segment')['user_id'].value_counts()

# Users whose row count differs from one made at least one payment;
# sum the boolean mask per segment to count paying users
pay_number_segment = (segment_user_id_counts != 1).groupby(level=0).sum()
# print(user_number_segment)
# print(pay_number_segment)
# pay_number_segment / user_number_segment * 100
Hide code cell source
# Discrete colorscale for the statusT heatmaps: four equal bands over the
# normalized 0-1 color range, in order blue (TF0.0), orange (FT2.99),
# green (FF2.99) and white (no transaction that week)
colorscale = [
    [0, 'rgba(31, 119, 180, 0.7)'],
    [0.25, 'rgba(31, 119, 180, 0.7)'],
    [0.25, 'rgba(255, 127, 14, 0.7)'],
    [0.5, 'rgba(255, 127, 14, 0.7)'],
    [0.5, 'rgba(44, 160, 44, 0.7)'],
    [0.75, 'rgba(44, 160, 44, .7)'],
    [0.75, 'rgba(255, 255, 255, .7)'],
    [1, 'rgba(255, 255, 255, .7)']
]
# Custom tick labels for the colorbar; the tick values place each label
# inside the first three color bands
colorbar_tickvals = [0.375, 1.125, 1.875]
colorbar_ticktext = ['TF0.0', 'FT2.99', 'FF2.99']
height_each = 250  # height (px) of each per-segment heatmap

# segment0: weeks on the y-axis, one column per user (x tick labels hidden)
fig0 = px.imshow(df_p_f_0_wide, aspect="auto")
fig0.update_traces(opacity=0.7)
fig0.update_layout(title="Behaviour history of users", width=figure_width,
                   margin=dict(l=2, r=2, t=30, b=0),height=height_each, xaxis_title=None,
    coloraxis=dict(colorscale=colorscale,
                                 colorbar=dict(tickvals=colorbar_tickvals,thickness=25, 
                                               ticktext=colorbar_ticktext)))
fig0.update_yaxes(autorange=True, title='segment0<br>Week')
fig0.update_xaxes(showticklabels=False)
# segment1 (same layout, no title so the three panels stack visually)
fig1 = px.imshow(df_p_f_1_wide, aspect="auto")
fig1.update_traces(opacity=0.7)
fig1.update_layout(margin=dict(l=2, r=2, t=0, b=0), height=height_each, xaxis_title=None, width=figure_width,
    coloraxis=dict(colorscale=colorscale,
                                 colorbar=dict(tickvals=colorbar_tickvals,thickness=25, 
                                               ticktext=colorbar_ticktext)))
fig1.update_yaxes(autorange=True, title='segment1<br>Week') 
fig1.update_xaxes(showticklabels=False)

# segment2 (bottom panel keeps its x tick labels)
fig2 = px.imshow(df_p_f_2_wide, aspect="auto")
fig2.update_traces(opacity=0.7)
fig2.update_layout(margin=dict(l=2, r=2, t=0, b=2), height=height_each, width=figure_width,
                   coloraxis=dict(colorscale=colorscale,
                                 colorbar=dict(tickvals=colorbar_tickvals,thickness=25, 
                                               ticktext=colorbar_ticktext)))
fig2.update_yaxes(autorange=True, title='segment2<br>Week') 

fig0.show()
fig1.show()
fig2.show()
  • There are only three kinds of transactions:

    • TF0.0: is_free_trial is true, is_conversion_to_pay is false, with 0.00 income →→→ the person starts a free trial

    • FT2.99: is_free_trial is false, is_conversion_to_pay is true, with 2.99 income →→→ the person starts to pay after a free trial

    • FF2.99: is_free_trial is false, is_conversion_to_pay is false, with 2.99 income →→→ the person keeps paying

  • All customers start from a free trial.

  • 70.23% of free-trial users start their free-trials on the same day as installing.

  • 30 weeks of shopping opportunities from 2023-09-01 to 2024-03-25 are monitored.

Hide code cell source
# Plot free-trial users vs payment users per segment; payers hatched with 'x'
fig_transaction = px.histogram(
    data_merge_unique,
    x='segment',
    color='segment',
    opacity=0.7,
    color_discrete_sequence=color_palette,
    category_orders={"segment": ["segment0", "segment1", "segment2"]},
    pattern_shape="payment_user",
    pattern_shape_sequence=["", "x"],  # text_auto=True
)
fig_transaction.update_layout(
    width=figure_width,
    title_text='Free-trial users Vs Payment users',  # title of plot
    yaxis_title_text='User Count',
)
# Pre-computed trial-to-pay conversion rates annotated above each bar
for seg, rate in (('segment0', "49.94%"), ('segment1', "49.92%"), ('segment2', "49.40%")):
    fig_transaction.add_annotation(x=seg, y=5300, text=rate, showarrow=False)
fig_transaction.show()
  • 2536 out of 5078 free-trial users are converted to payment users for segment0, trial conversion rate is 49.94%.

  • 2504 out of 5016 free-trial users are converted to payment users for segment1, trial conversion rate is 49.92%.

  • 2403 out of 4864 free-trial users are converted to payment users for segment2, trial conversion rate is 49.40%.

  • Trial conversion rates for three segments are pretty similar.

Hide code cell source
# Weekly purchase counts, one facet row per segment
fig_week = px.histogram(
    data_merge_payment,
    x='week',
    color='segment',
    facet_row='segment',
    color_discrete_sequence=color_palette,
    category_orders={"segment": ["segment0", "segment1", "segment2"]},
)
fig_week.update_layout(title='Accumulated purchasing frequency over time', width=figure_width)
# Strip the "segment=" prefix from the facet labels
fig_week.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig_week.show()
  • The shopping frequency reaches its maximum value at week 4 for all three segments.

  • The three groups of users share similar purchasing patterns.

Regular metrics#

Retention Rate#

Percentage of users who remained active over time.
retention rate = (CE - CN)/CS
CE : number of customers at the end the week.
CN : number of new customers during the week.
CS : number of customers when the week begins.

Hide code cell source
# Weekly retention rate per segment:
#   retention rate = (CE - CN) / CS
# where CE = users active at the end of the week (week n+1),
#       CN = users in week n+1 who were not active in week n (new users),
#       CS = users active when the week begins (week n).
# CE - CN is simply the number of week-n users still active in week n+1,
# i.e. |CS ∩ CE|, so compute that intersection directly.
retention_rate = []
for n in range(1, 30):
    for segment, segment_data in data_merge2.groupby('segment'):
        CE_user_id = set(segment_data.loc[segment_data['week'] == n + 1, 'user_id'].unique())
        CS_user_id = set(segment_data.loc[segment_data['week'] == n, 'user_id'].unique())

        # Users active in both consecutive weeks = retained users (CE - CN)
        retained = len(CE_user_id & CS_user_id)

        # NOTE(review): raises ZeroDivisionError if no user is active in
        # week n — matches original behavior; guard this in production.
        retention_rate.append({'segment': segment, 'week': n,
                               'retention_rate': retained / len(CS_user_id)})

# Convert the list of dictionaries to a DataFrame
retention_rate_df = pd.DataFrame(retention_rate)
Hide code cell source
# Line plot of the weekly retention rate per segment
fig_retention_rate = px.line(
    retention_rate_df,
    x='week',
    y='retention_rate',
    color='segment',
    symbol='segment',
    color_discrete_sequence=color_palette,
    line_dash='segment',
    category_orders={"segment": ["segment0", "segment1", "segment2"]},
)
fig_retention_rate.update_layout(title='Weekly retention rates from 2023-09-01 (week1) ', width=figure_width)
fig_retention_rate.show()
Hide code cell source
# Distribution of the weekly retention rates, bars grouped side by side
fig_r2 = px.histogram(
    retention_rate_df,
    x='retention_rate',
    color='segment',
    barmode='group',
    color_discrete_sequence=color_palette,
    category_orders={"segment": ["segment0", "segment1", "segment2"]},
)
fig_r2.update_layout(title='Retention rate distributions', width=figure_width)
fig_r2.show()
  • There are no clear advantages or disadvantages when comparing retention rates week by week.

  • The peak values of retention rates of segment0 and segment2 are more skewed to bigger values.

  • But segment1 seems to be more stable and secured with higher rates.

  • Sum the retention rate over 30 weeks by segments gives:

    • segment0: 16.050889

    • segment1: 16.379672

    • segment2: 16.016881

ARPU#

ARPU represents average revenue per user.

Hide code cell source
# totalRevenue: total income (USD) per segment
revenue = data_merge2[['segment','price_in_usd']]
revenue = revenue.groupby(by=['segment']).sum()
revenue.rename(columns={'price_in_usd':'totalRevenue'}, inplace=True)

# totalInstallUsers: number of unique installed users per segment
install_users = installs[['user_id','segment']]
install_users = install_users.drop_duplicates(subset=['user_id'])
install_users.rename(columns={'user_id':'totalInstallUsers'}, inplace=True)
install_users = install_users.groupby(by=['segment']).count()

# totalPayingUsers and totalFreeTrialUsers
# Group by 'segment' and 'status' and count the occurrences
freetrial_paying_users = data_merge2.groupby(['segment', 'status']).size().reset_index(name='count')

# Unstack the 'status' column: 'FT2.99' rows count trial-to-paid
# conversions, 'TF0.0' rows count free-trial starts
freetrial_paying_users = freetrial_paying_users.pivot(index='segment', columns='status', values='count')
freetrial_paying_users = freetrial_paying_users[['FT2.99', 'TF0.0']]
# Rename the columns for clarity
freetrial_paying_users.rename(columns={'FT2.99': 'totalPayingUsers', 'TF0.0': 'totalFreeTrialUsers'}, inplace=True)

# arpu: assemble the metric table and derive the revenue-per-user ratios
arpu = pd.concat([revenue, install_users, freetrial_paying_users], axis=1)
# Rename the columns for clarity
# arpu.rename(columns={'price_in_usd': 'totalRevenue', 'user_id_free_trial': 'totalFreeTrialUsers', 
#                      'user_id_installs':'totalInstallUsers'}, inplace=True)
arpu['arpuByFreeTrial'] = arpu['totalRevenue'] / arpu['totalFreeTrialUsers']
arpu['arpuByInstall'] = arpu['totalRevenue'] / arpu['totalInstallUsers']
arpu['arppu'] = arpu['totalRevenue'] / arpu['totalPayingUsers']

# Highlight each column's maximum and shrink the table font for display
arpu.style.apply(lambda x: ['background-color: rgba(255, 127, 14, 0.2)' if value == x.max() else '' for value in x], axis=0)\
          .set_properties(**{'font-size': '8pt'})\
        .set_table_styles([
    {'selector': 'td', 'props': 'font-size: 0.7em;'},
    {'selector': 'th', 'props': 'font-size: 0.7em;'},
], overwrite=False)
  totalRevenue totalInstallUsers totalPayingUsers totalFreeTrialUsers arpuByFreeTrial arpuByInstall arppu
segment              
segment0 26267.150000 50177 2536 5078 5.172735 0.523490 10.357709
segment1 25863.500000 50174 2504 5016 5.156200 0.515476 10.328874
segment2 25193.740000 49649 2403 4864 5.179634 0.507437 10.484286

LTV#

Hide code cell source
# NOTE: datetime is already imported as dt at the top of the file,
# so the duplicate import was removed here.
from lifetimes import BetaGeoBetaBinomFitter

# Divide data by segments, keeping only the columns the BG/BB model needs
df = data_merge_payment[['user_id','segment','transaction_date','status','week', 'price_in_usd', 'n_weeks']]
segment0 = df[df['segment'] == 'segment0']
segment1 = df[df['segment'] == 'segment1']
segment2 = df[df['segment'] == 'segment2']
Hide code cell source
# Define a function to calculate the per-user inputs for the BG/BB model

def gbbb_metrics(transaction_data):
    """Build per-user recency/frequency/monetary inputs for the BG/BB model.

    Parameters
    ----------
    transaction_data : pd.DataFrame
        Must contain 'user_id', 'transaction_date', 'price_in_usd'
        and 'n_weeks' columns.

    Returns
    -------
    pd.DataFrame
        Indexed by user_id with columns:
        recency   -- weeks between the user's first and last transaction
        frequency -- number of distinct transaction dates minus one
                     (repeat transactions only)
        monetary  -- average revenue per distinct transaction date
        n         -- number of weekly transaction opportunities (max n_weeks)
    """
    # Named aggregation avoids the fragile droplevel/column-renaming dance
    rtfm = transaction_data.groupby(by='user_id').agg(
        recency=('transaction_date', lambda x: (x.max() - x.min()).days / 7),
        frequency=('transaction_date', lambda x: x.nunique() - 1),
        monetary=('price_in_usd', 'sum'),
        n=('n_weeks', 'max'),
    )
    # frequency + 1 equals the number of distinct transaction dates
    rtfm['monetary'] = rtfm['monetary'] / (rtfm['frequency'] + 1)
    return rtfm

Calculate BG/BB model inputs#

Hide code cell source
# Build the BG/BB model inputs for each segment
rtfm0 = gbbb_metrics(segment0)
rtfm1 = gbbb_metrics(segment1)
rtfm2 = gbbb_metrics(segment2)

# Preview the first rows of each segment's inputs
for label, frame in (("Segment 0:", rtfm0), ("\nSegment 1:", rtfm1), ("\nSegment 2:", rtfm2)):
    print(label)
    print(frame.head())
Segment 0:
         recency  frequency  monetary   n
user_id                                  
10           1.0          1  1.495000  27
46           5.0          5  2.491667  27
62           6.0          6  2.562857  27
88           1.0          1  1.495000  28
144         10.0          6  2.562857  30

Segment 1:
         recency  frequency  monetary   n
user_id                                  
5            0.0          0  0.000000  29
22           8.0          5  2.491667  30
48           1.0          1  1.495000  28
50           1.0          1  1.495000  30
70           0.0          0  0.000000  27

Segment 2:
         recency  frequency  monetary   n
user_id                                  
34           2.0          2  1.993333  27
40           0.0          0  0.000000  28
58           0.0          0  0.000000  29
59          16.0          9  2.691000  28
61           0.0          0  0.000000  28

Model evaluation#

Hide code cell source
# Fit one BG/BB model per segment, with a small penalizer for stability
penalizer_coef = 0.001

fitted_models = []
for rtfm in (rtfm0, rtfm1, rtfm2):
    model = BetaGeoBetaBinomFitter(penalizer_coef=penalizer_coef)
    model.fit(frequency=rtfm['frequency'], recency=rtfm['recency'], n_periods=rtfm['n'])
    fitted_models.append(model)
bgbb0, bgbb1, bgbb2 = fitted_models

print('bgbb0:', bgbb0)
print('bgbb1:', bgbb1)
print('bgbb2:', bgbb2)
bgbb0: <lifetimes.BetaGeoBetaBinomFitter: fitted with 5078 subjects, alpha: 3.60, beta: 1.22, delta: 1.95, gamma: 1.66>
bgbb1: <lifetimes.BetaGeoBetaBinomFitter: fitted with 5016 subjects, alpha: 3.60, beta: 1.22, delta: 1.97, gamma: 1.67>
bgbb2: <lifetimes.BetaGeoBetaBinomFitter: fitted with 4864 subjects, alpha: 3.63, beta: 1.28, delta: 1.90, gamma: 1.62>
Hide code cell source
# Compare actual purchase-frequency counts with the BG/BB expectation
segments = [rtfm0, rtfm1, rtfm2]
bgbbs = [bgbb0, bgbb1, bgbb2]
segment_names = ['segment0', 'segment1', 'segment2']

# Collect per-segment frames and concatenate once at the end instead of
# growing the DataFrame with pd.concat inside the loop (quadratic copies)
frames = []
for bgbb, segment_name, segment in zip(bgbbs, segment_names, segments):
    # Expected number of users at each transaction count over 30 periods
    models = bgbb.expected_number_of_transactions_in_first_n_periods(30).reset_index()
    # Actual user counts per observed frequency
    actual = segment[['frequency']].reset_index().groupby('frequency').count().reset_index()
    # Adding the 'segment' and 'type' column
    models['segment'] = segment_name
    models['type'] = 'BG/BB'
    actual['segment'] = segment_name
    actual['type'] = 'Actual'
    # Align column names for concatenation
    models = models.rename(columns={'model': 'users'})
    actual = actual.rename(columns={'user_id': 'users'})
    frames.append(models)
    frames.append(actual)

df_e_n_t_b = pd.concat(frames, ignore_index=True)
df_e_n_t_b = df_e_n_t_b[['frequency', 'users', 'segment', 'type']]

## Plot
fig_e_n_t_b = px.bar(df_e_n_t_b, x='frequency', y='users', facet_row='segment', color='type', barmode='group')
fig_e_n_t_b.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig_e_n_t_b.update_layout(title='Frequency Vs user counts for Actual and BG/BB estimation', width=figure_width)
fig_e_n_t_b.show()
  • The model results align with the actual measurements

  • This serves as proof, to some degree, that the model is trustworthy.

Model prediction#

Hide code cell source
# Predict accumulated future transactions per user for m weeks ahead,
# averaged within each segment, collected into a dataframe for later plots
segments = [rtfm0, rtfm1, rtfm2]
bgbbs = [bgbb0, bgbb1, bgbb2]
segment_names = ['segment0', 'segment1', 'segment2']

# Accumulate plain records and build the DataFrame once, instead of the
# original per-iteration pd.concat (quadratic copies of the frame)
records = []
for segment, bgbb, segment_name in zip(segments, bgbbs, segment_names):
    for m_periods_in_future in range(1, 31):
        # Conditional expectation given each user's frequency/recency history
        n_purchases = bgbb.conditional_expected_number_of_purchases_up_to_time(
            m_periods_in_future=m_periods_in_future, 
            frequency=segment['frequency'], 
            recency=segment['recency'], 
            n_periods=segment['n']
        )
        records.append({
            'm_periods_in_future': m_periods_in_future,
            'n_purchases': n_purchases.values.mean(),
            'segment': segment_name
        })

df_c_e_n_p_f = pd.DataFrame(records, columns=['m_periods_in_future', 'n_purchases', 'segment'])

# plot
fig_f_transactions = px.scatter(df_c_e_n_p_f,x='m_periods_in_future', y='n_purchases', color='segment', symbol='segment',
                                color_discrete_sequence=color_palette,
                            category_orders={"segment": ["segment0", "segment1", "segment2"]},)
fig_f_transactions.update_xaxes(title='m weeks in the future after 2024-03-25')
fig_f_transactions.update_yaxes(title='Expected average transactions (accumulated)')
fig_f_transactions.update_layout(title='Conditional expected transaction for future weeks', width=figure_width)
fig_f_transactions.show()
  • The current model suggests that segment2 has higher future transaction expectations.

  • However, the model hyperparameters have not been thoroughly estimated or cross-validated.

Average LTV#

Hide code cell source
## ltv before 30 weeks: average accumulated LTV from observed revenue
ltv = data_merge_payment[['week', 'segment', 'price_in_usd']]
ltv_sum = ltv.groupby(by=['segment', 'week'])['price_in_usd'].sum().reset_index()

# Free-trial user counts per segment (hard-coded from the transaction
# review above) used as the per-user denominator
segments_counts = {'segment0': 5078, 'segment1': 5016, 'segment2': 4864}

# Apply division to calculate LTV per segment
ltv_sum['ltv'] = ltv_sum.apply(lambda row: row['price_in_usd'] / segments_counts[row['segment']], axis=1)
ltv_accumulated = ltv_sum.groupby('segment')['ltv'].cumsum()
ltv_sum['ltv_accumulated'] = ltv_accumulated
ltv_b = ltv_sum[['segment','week','ltv_accumulated']]

## ltv from 30 to 60: projected from the BG/BB expected transactions
df_c_e_n_p_f['ltv'] = df_c_e_n_p_f['n_purchases'] * 2.99  # 2.99 USD per expected purchase
# Per-segment week-30 baselines added on top of the projection; these match
# the arpuByFreeTrial values in the ARPU table above
# NOTE(review): this reuses the 'segments_counts' name from the block above
# with a different meaning — consider renaming
segments_counts = {'segment0': 5.172735, 'segment1': 5.156200, 'segment2': 5.179634}
# Add the baseline to the projected revenue to continue the accumulated curve
df_c_e_n_p_f['ltv_accumulated'] = df_c_e_n_p_f.apply(lambda row: row['ltv'] + segments_counts[row['segment']], axis=1)
# Shift the prediction horizon so it continues from week 31 onwards
df_c_e_n_p_f['m_periods_in_future'] += 30
df_c_e_n_p_f.rename(columns={'m_periods_in_future': 'week'}, inplace=True)

ltv_f = df_c_e_n_p_f[['segment','week','ltv_accumulated']]

# Stitch the observed and projected accumulated LTV curves together
combined_ltv = pd.concat([ltv_b, ltv_f], ignore_index=True)

# plot

fig_ltv = px.line(combined_ltv,x='week', y='ltv_accumulated', color='segment', 
                                color_discrete_sequence=color_palette, line_dash='segment',
                            category_orders={"segment": ["segment0", "segment1", "segment2"]},)
fig_ltv.update_xaxes(title='week')
fig_ltv.update_yaxes(title='Accumulated average LTV in usd')
fig_ltv.update_layout(title='Average LTV from week 1 to week 60', width=figure_width)
fig_ltv.show()
  • In the long run, segment2 gives a slightly higher LTV.

  • During the rapid increasing period (before week 20), segment0 provides slightly higher LTV.

Conclusion:
  • The metrics above reveal minimal differences among the three groups.

  • Segment0 demonstrates a slight edge in Conversion Rate and Trial Conversion Rate.

  • Segment1 exhibits a slight advantage in Retention Rate.

  • Segment2 shows a slight lead in ARPU, ARPPU, and LTV.

  • It appears that distinct strategies are optimal for different customer lifetimes.